{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open('ratings.dat', 'r')\n",
    "data = f.readlines()\n",
    "f = open('movies.dat', 'r', encoding='ISO-8859-1')\n",
    "movies = f.readlines()\n",
    "movie_names = [_.split('::')[1] for _ in movies]\n",
    "movie_ids = [_.split('::')[0] for _ in movies]\n",
    "movie_dict = dict(zip(movie_ids, movie_names))\n",
    "id_mapping = dict(zip(movie_ids, range(len(movie_ids))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "interaction_dicts = dict()\n",
    "for line in data:\n",
    "    user_id, movie_id, rating, timestamp = line.split('::')\n",
    "    if user_id not in interaction_dicts:\n",
    "        interaction_dicts[user_id] = {\n",
    "            'movie_id': [],\n",
    "            'rating': [],\n",
    "            'timestamp': [],\n",
    "            'movie_title': [],\n",
    "        }\n",
    "    interaction_dicts[user_id]['movie_id'].append(movie_id)\n",
    "    interaction_dicts[user_id]['rating'].append(int(float(rating) > 3.0))\n",
    "    interaction_dicts[user_id]['timestamp'].append(timestamp)\n",
    "    interaction_dicts[user_id]['movie_title'].append(movie_dict[movie_id])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('all.csv', 'w') as f:\n",
    "    import csv\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(['user_id', 'item_id', 'rating', 'timestamp', 'item_title'])\n",
    "    for user_id, user_dict in interaction_dicts.items():\n",
    "        writer.writerow([user_id, user_dict['movie_id'], user_dict['rating'], user_dict['timestamp'], user_dict['movie_title']])\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9301274\n"
     ]
    }
   ],
   "source": [
    "sequential_interaction_list = []\n",
    "seq_len = 10\n",
    "for user_id in interaction_dicts:\n",
    "    temp = zip(interaction_dicts[user_id]['movie_id'], interaction_dicts[user_id]['rating'], interaction_dicts[user_id]['timestamp'], interaction_dicts[user_id]['movie_title'])\n",
    "    temp = sorted(temp, key=lambda x: int(x[2]))\n",
    "    result = zip(*temp)\n",
    "    interaction_dicts[user_id]['movie_id'], interaction_dicts[user_id]['rating'], interaction_dicts[user_id]['timestamp'], interaction_dicts[user_id]['movie_title'] = [list(_) for _ in result]\n",
    "    for i in range(10, len(interaction_dicts[user_id]['movie_id'])):\n",
    "        sequential_interaction_list.append(\n",
    "            [user_id, interaction_dicts[user_id]['movie_title'][i - seq_len: i],interaction_dicts[user_id]['movie_id'][i-seq_len:i], interaction_dicts[user_id]['rating'][i-seq_len:i], interaction_dicts[user_id]['movie_id'][i], interaction_dicts[user_id]['rating'][i], interaction_dicts[user_id]['timestamp'][i].strip('\\n')]\n",
    "        )\n",
    "print(len(sequential_interaction_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "sequential_interaction_list = sorted(sequential_interaction_list, key=lambda x: int(x[-1]))\n",
    "with open('./train.csv', 'w') as f:\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(['user_id', 'history_movie_title', 'history_movie_id', 'history_rating', 'movie_id', 'rating', 'timestamp'])\n",
    "    writer.writerows(sequential_interaction_list[:int(len(sequential_interaction_list)*0.8)])\n",
    "with open('./valid.csv', 'w') as f:\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(['user_id', 'history_movie_title', 'history_movie_id', 'history_rating', 'movie_id', 'rating', 'timestamp'])\n",
    "    writer.writerows(sequential_interaction_list[int(len(sequential_interaction_list)*0.8):int(len(sequential_interaction_list)*0.9)])\n",
    "with open('./test.csv', 'w') as f:\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(['user_id', 'history_movie_title', 'history_movie_id', 'history_rating', 'movie_id', 'rating', 'timestamp'])\n",
    "    writer.writerows(sequential_interaction_list[int(len(sequential_interaction_list)*0.9):])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import random\n",
    "import numpy as np\n",
    "def csv_to_json(input_path, output_path, sample=False):\n",
    "    data = pd.read_csv(input_path)\n",
    "    if sample:\n",
    "        data = data.sample(n=5000, random_state=42).reset_index(drop=True)\n",
    "        data.to_csv(output_path[:-5] + \".csv\", index=False)\n",
    "    json_list = []\n",
    "    for index, row in data.iterrows():\n",
    "        row['history_movie_id'] = eval(row['history_movie_id'])\n",
    "        row['history_movie_title'] = eval(row['history_movie_title'])\n",
    "        L = len(row['history_movie_id'])\n",
    "        history = \"The user has watched the following movies before:\"\n",
    "        for i in range(L):\n",
    "            if i == 0:\n",
    "                history += \"\\\"\" + row['history_movie_title'][i] + \"\\\"\"\n",
    "            else:\n",
    "                history += \", \\\"\" + row['history_movie_title'][i] + \"\\\"\"\n",
    "        target_movie_name = \"\\\"\" + movie_dict[str(row['movie_id'])] + \"\\\"\"\n",
    "        json_list.append({\n",
    "            \"instruction\": \"Given a list of movies the user has watched before, please recommend a new movie that the user likes to the user.\",\n",
    "            \"input\": f\"{history}\\n \",\n",
    "            \"output\": target_movie_name,\n",
    "        })    \n",
    "        \n",
    "    with open(output_path, 'w') as f:\n",
    "        json.dump(json_list, f, indent=4)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_to_json('./train.csv', './train.json')\n",
    "csv_to_json('./valid.csv', './valid.json')\n",
    "csv_to_json('./test.csv', './test.json')\n",
    "csv_to_json('./valid.csv', './valid_5000.json', sample=True)\n",
    "csv_to_json('./test.csv', './test_5000.json', sample=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "alpaca_lora",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "59fa845e12d05d721e6f4368480cbf49d04f4a649a02e83c3e47bffdee3cc61a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
